{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36664bitea6884f10f474b21a2a2f022451e0d09",
   "display_name": "Python 3.6.6 64-bit",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.feature_selection import SelectKBest,chi2,RFE\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "df = pd.read_csv(\"data/Heart_Disease_Prediction.csv\")\n",
    "print(df.shape)\n",
    "\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "label = df[\"Heart Disease\"]\n",
    "df.drop(\"Heart Disease\", axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(label.value_counts())\n",
    "label.value_counts().plot(kind=\"bar\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = [\"Sex\", \"Chest pain type\", \"FBS over 120\", \"EKG results\", \"Exercise angina\", \"Slope of ST\", \"Number of vessels fluro\", \"Thallium\"]\n",
    "df[categorical_features] = df[categorical_features].astype(\"category\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "continuous_features = set(df.columns) - set(categorical_features)\n",
    "scaler = MinMaxScaler()\n",
    "df_norm = df.copy()\n",
    "df_norm[list(continuous_features)] = scaler.fit_transform(df[list(continuous_features)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_new = SelectKBest(k=5, score_func=chi2).fit_transform(df_norm, label)\n",
    "X_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)\n",
    "X_new = rfe.fit_transform(df_norm, label)\n",
    "X_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = RandomForestClassifier()\n",
    "clf.fit(df_norm, label)\n",
    "# create a figure to plot a bar, where x axis is features, and Y indicating the importance of each feature\n",
    "plt.figure(figsize=(12,12))\n",
    "plt.bar(df_norm.columns, clf.feature_importances_)\n",
    "plt.xticks(rotation=45)"
   ]
  }
 ]
}